#' ----------------------------------------------------------------------------
#' Merge CHES Global Datasets
#' 
#' Description: This script merges data from various CHES surveys (Europe,
#' Latin America, US, Australia, Israel, Canada) into a single global dataset.

# Packages ---------------------------------------------------------------------

# Install and load required packages
if (!require("pacman")) install.packages("pacman")

p_load(tidyverse,
       haven, 
       hbamr)

# Load and select data ---------------------------------------------------------

# Europe
df_eu <- haven::read_dta('data/ches_data/ches_eu_2019_expert_level.dta') %>%
  select(country_id,
         country_name,
         party_id,
         party_name,
         id,
         lrecon,
         party_a_econ,
         party_b_econ,
         party_c_econ,
         gender,
         dob,
         lrecon_self) %>%
  rename(expert_id = id,
         country = country_name,
         party = party_name,
         party_left = party_a_econ,
         party_right = party_b_econ,
         party_center = party_c_econ,
         expert_gender = gender,
         expert_year = dob,
         expert_lrgen = lrecon_self) %>%
  mutate(region_id = 1,
         expert_id = paste0(sprintf("%02d", country_id),
                            sprintf("%02d", expert_id)),
         party_id = as.character(party_id),
         party_left = party_left - 1,
         party_right = party_right - 1,
         party_center = party_center - 1,
         expert_lrgen = expert_lrgen - 1) # discuss with Ryan

# Latin America
df_la <- readRDS('data/ches_data/ches_la_2020_expert_level.rds') %>%
  ungroup() %>%
  mutate(region_id = 2) %>%
  select(region_id,
         country_id,
         country,
         party_id,
         party_abb,
         expert_id,
         lrecon,
         hypothetical_1,
         hypothetical_2,
         hypothetical_3,
         expert_gender,
         expert_year,
         expert_lrgen) %>%
  rename(party = party_abb,
         party_left = hypothetical_1,
         party_right = hypothetical_2,
         party_center = hypothetical_3)

# United States
df_us <- haven::read_dta('data/ches_data/ches_US_2021_expert_level.dta') %>%
  mutate(region_id = 3,
         country_id = 70,
         country = 'United States',
         party_id = paste0(country_id, sprintf("%02d", party)),
         party = as.character(as_factor(party)),
         expert_id = paste0(country_id, sprintf("%02d", id))) %>%
  select(region_id,
         country_id,
         country,
         party_id,
         party,
         expert_id,
         lrecon,
         Party_B_econ,
         Party_C_econ,
         Party_A_econ,
         gender,
         yob,
         lr_self) %>%
  rename(party_left = Party_B_econ,
         party_right = Party_C_econ,
         party_center = Party_A_econ,
         expert_gender = gender,
         expert_year = yob,
         expert_lrgen = lr_self)

# Australia
df_au <- haven::read_dta('data/ches_data/ches_austr_2023_expert_level.dta') %>%
  mutate(region_id = 4,
         country_id = 71,
         country = "Australia",
         party_id = paste0(country_id, sprintf("%02d", party)),
         expert_id = paste0(country_id, sprintf("%02d", id))) %>%
  select(region_id,
         country_id,
         country,
         party_id,
         party_name,
         expert_id,
         lrecon,
         party_b_econ,
         party_c_econ,
         party_a_econ,
         gender,
         yob,
         lrgen_self) %>%
  rename(party = party_name,
         party_left = party_b_econ,
         party_center = party_a_econ,
         party_right = party_c_econ,
         expert_gender = gender,
         expert_year = yob,
         expert_lrgen = lrgen_self)

# Israel
df_is <- haven::read_dta('data/ches_data/ches_israel_2022_expert_level.dta') %>%
  mutate(region_id = 5,
         country_id = 80,
         country = 'Israel',
         party_id = paste0(country_id, sprintf("%02d", party)),
         party = as.character(as_factor(party)),
         expert_id = paste0(country_id, sprintf("%02d", id))) %>%
  select(region_id,
         country_id,
         country,
         party_id,
         party,
         expert_id,
         lrecon,
         party_B_econ,
         party_C_econ,
         party_A_econ,
         gender,
         yob,
         lrecon_self) %>%
  rename(party_left = party_A_econ,
         party_right = party_B_econ,
         party_center = party_C_econ,
         expert_gender = gender,
         expert_year = yob,
         expert_lrgen = lrecon_self) %>%
  mutate(party_left = party_left - 1,
         party_right = party_right - 1,
         party_center = party_center - 1,
         expert_lrgen = expert_lrgen - 1)

# Canada
df_ca <- haven::read_dta('data/ches_data/ches_canada_2023_expert_level.dta') %>%
  mutate(region_id = 6,
         country_id = 90,
         country = "Canada",
         party_id = paste0(country_id, sprintf("%02d", party_id)),
         party = as.character(as_factor(party)),
         expert_id = paste0(country_id, sprintf("%02d", expert))) %>%
  select(region_id,
         country_id,
         country,
         party_id,
         party,
         expert_id,
         lrecon,
         vignette_A,
         vignette_B,
         vignette_C,
         expert_gender,
         expert_dob,
         expert_lrecon) %>%
  rename(party_left = vignette_A,
         party_right = vignette_B,
         party_center = vignette_C,
         expert_year = expert_dob,
         expert_lrgen = expert_lrecon)

# Append and reshape data ------------------------------------------------------

# Append
df_global <- bind_rows(df_au, df_eu, df_la, df_us, df_is, df_ca) %>%
  select(expert_id,
         region_id,
         country_id,
         country,
         party_id,
         party,
         party_left,
         party_center,
         party_right,
         lrecon,
         expert_gender,
         expert_year,
         expert_lrgen)

# Remove input files
rm(df_au, df_eu, df_la, df_us, df_is, df_ca)

# Check for issues: Missing Vignettes and Vignette Equivalence -----------------

# Vignette equivalence

## Total Experts
df_global %>%
  summarise(cases = length(unique(expert_id))) # 730

## Total of experts without any vignettes placements
df_global %>%
  filter(is.na(party_left) &
         is.na(party_center) &
         is.na(party_right)) %>%
  summarise(cases = length(unique(paste(country_id, expert_id)))) # 53

### Wrong vignette placements
df_global %>%
  filter(!(is.na(party_left) &
           is.na(party_center) &
           is.na(party_right))) %>%
  filter(ifelse((party_left <= party_center &
                 party_center <= party_right) |
                (is.na(party_left) |
                 is.na(party_center) |
                 is.na(party_right)),
              FALSE,
              TRUE)) %>%
  summarise(cases = length(unique(expert_id))) # 8

### Eliminate cases with all vignettes missing
df_global <- df_global %>%
  filter(!(is.na(party_left) &
           is.na(party_center) &
           is.na(party_right)))

## Delete cases with problems with vignette equivalence
df_global <- df_global %>%
  filter(ifelse((party_left <= party_center &
                 party_center <= party_right) |
                (is.na(party_left) |
                 is.na(party_center) |
                 is.na(party_right)),
              TRUE,
              FALSE))

# Reshape data to wide format for analysis -------------------------------------

# Reshape wide
df_wide <- df_global %>%
  pivot_wider(id_cols = c(region_id,
                          country_id,
                          country,
                          expert_id,
                          party_left,
                          party_center,
                          party_right,
                          expert_gender,
                          expert_year,
                          expert_lrgen),
              names_from = party_id,
              values_from = lrecon,
              names_prefix = 'lrecon_') %>%
  group_by(expert_id) %>%
  mutate(dupli = case_when(n() > 1 ~ 1, TRUE ~ 0),
         number = seq_len(n()),
         expert_id = ifelse(dupli == 1,
                            paste0(expert_id, "_", number),
                            expert_id)) %>% # Impute cases in Peru
  mutate_at(vars(party_left:expert_lrgen),
            function(x) ifelse(is.na(x) & 'dupli' == 1,
                               mean(x, na.rm = TRUE),
                               x)) %>%
  select(-dupli, -number) %>% # Reorder data
  select(region_id:expert_id,
         expert_gender:expert_lrgen,
         party_left:party_right,
         lrecon_7101:lrecon_9006)

# Convert all haven_labelled objects to regular numeric values
df_wide <- df_wide %>%
  mutate(across(starts_with("lrecon_"), ~as.numeric(zap_labels(.))))

# Subset: only cases used in the manuscript: 3 valid placements/2 unique
keep_cases <- prep_data(
  df_wide$expert_lrgen, # self-placements
  df_wide[, which(names(df_wide) == "party_left"):ncol(df_wide) - 1], 
  req_valid = 3, 
  req_unique = 2)$keep

# Subset Expert
df_wide <- df_wide[keep_cases == TRUE, ]

# Save data --------------------------------------------------------------------

# Save data in long and wide format
saveRDS(df_global, "outputs/df_global_expert_party.rds")
saveRDS(df_wide, "outputs/df_global_expert.rds")

# Save session information for reproducibility
writeLines(capture.output(sessionInfo()), "outputs/session_info/00_data.txt")